#include <iostream>
#include <opencv2/core.hpp>
#include <opencv2/imgcodecs.hpp>
#include <opencv2/imgproc.hpp>
#include <opencv2/highgui.hpp>

static cv::Mat _my_simple_remap(const cv::Mat& mat, const cv::Mat& mx, const cv::Mat& my) {
    cv::Mat dst(mat.size(), mat.type());
    for (int i = 0; i < mat.rows; i++) {
        for (int j = 0; j < mat.cols; j++) {
            int x = (int)mx.ptr<float>(i)[j];
            int y = (int)my.ptr<float>(i)[j];
            dst.ptr<uchar>(i)[j] = mat.ptr<uchar>(y)[x];
        }
    }
    return dst;
}

static bool _is_same(const cv::Mat& m1, const cv::Mat& m2) {
    CV_Assert(m1.type() == CV_8UC1);
    CV_Assert(m2.type() == CV_8UC1);
    cv::Mat diff = m1 != m2;
    std::vector<cv::Point> nonzeros;
    cv::findNonZero(diff, nonzeros);
    return nonzeros.size() == 0;
}

static int _clip(int x, int a, int b) {
    if (x >= a) {
        if (x < b) {
            return x;
        }
        return b-1;
    }
    return a;
}

static int _border_interpolate(int p, int len, int borderType) {
    if (p >= 0 && p < len) {
        // nothing
    } else if (borderType == cv::BORDER_REPLICATE) {
        p = (p < 0 ? 0 : len-1);
    } else if (borderType == cv::BORDER_REFLECT) {
        if (len == 1) {
            return 0;
        }
        do {
            if (p < 0) {
                p = -p-1;
            } else {
                p = len-1-(p-len);
            }
        } while (p < 0 || p >= len);
    } else if (borderType == cv::BORDER_REFLECT_101) {
        if (len == 1) {
            return 0;
        }
        do {
            if (p < 0) {
                p = -p-1+1;
            } else {
                p = len-1-(p-len)-1;
            }
        } while (p < 0 || p >= len);
    } else if (borderType == cv::BORDER_WRAP) {
        if (p < 0) {
            p -= (p-len+1);
        }
        if (p >= len) {
            p %= len;
        }
    } else {  // cv::BORDER_CONSTANT
        p = -1;
    }
    return p;
}

static void _remap_nearest(const cv::Mat& src, cv::Mat& dst, const cv::Mat& mapx, const cv::Mat& mapy, int borderType, uchar borderValue) {
    for (int i = 0; i < dst.rows; i++) {
        uchar* pdst = dst.ptr<uchar>(i);
        const float* px = mapx.ptr<float>(i);
        const float* py = mapy.ptr<float>(i);
        for (int j = 0; j < dst.cols; j++) {
            int sx = (int)px[j];
            int sy = (int)py[j];
            if (sx >= 0 && sx < src.cols && sy >= 0 && sy < src.rows) {
                pdst[j] = src.ptr<uchar>(sy)[sx];
            } else {
                if (borderType == cv::BORDER_REPLICATE) {
                    sx = _clip(sx, 0, src.cols);
                    sy = _clip(sy, 0, src.rows);
                    pdst[j] = src.ptr<uchar>(sy)[sx];
                } else if (borderType == cv::BORDER_CONSTANT) {
                    pdst[j] = borderValue;
                } else if (borderType != cv::BORDER_TRANSPARENT) {
                    sx = _border_interpolate(sx, src.cols, borderType);
                    sy = _border_interpolate(sy, src.rows, borderType);
                    pdst[j] = src.ptr<uchar>(sy)[sx];
                }
            }
        }
    }
}

static void _linear_interpolate_coeffs(float x, float& coeff1, float& coeff2) {
    coeff1 = 1.f - x;
    coeff2 = x;
}

static void _init_1d_linear_interpolate_grid(int gridsize, std::vector<float>& coeffs1, std::vector<float>& coeffs2) {
    float scale = 1.f / gridsize;
    coeffs1.resize(gridsize);
    coeffs2.resize(gridsize);
    for (int i = 0; i < gridsize; i++) {
        float x = i * scale;
        float coeff1, coeff2;
        _linear_interpolate_coeffs(x, coeff1, coeff2);
        coeffs1[i] = coeff1;
        coeffs2[i] = coeff2;
    }
}

static std::vector<std::vector<cv::Mat>> _init_2d_linear_interpolate_grid() {
    int gridsize = cv::INTER_TAB_SIZE;  // 32
    std::vector<float> coeffs1, coeffs2;
    _init_1d_linear_interpolate_grid(gridsize, coeffs1, coeffs2);
    std::vector<std::vector<float>> coeffs;
    coeffs.push_back(coeffs1);
    coeffs.push_back(coeffs2);
    
    int bits = 15;
    int scale = (1 << bits);
    
    int ksize = 2;
    std::vector<std::vector<cv::Mat>> grids;
    for (int i = 0; i < gridsize; i++) {
        std::vector<cv::Mat> gridrow;
        for (int j = 0; j < gridsize; j++) {
            int isum = 0;
            cv::Mat grid(ksize, ksize, CV_16SC1);
            
            for (int k1 = 0; k1 < ksize; k1++) {
                float vy = coeffs[k1][i];
                for (int k2 = 0; k2 < ksize; k2++) {
                    float vx = coeffs[k2][j];
                    float v = vy * vx;
                    short val = cv::saturate_cast<short>(v * scale);
                    grid.ptr<short>(k1)[k2] = val;
                    isum += val;
                }
            }
            
            if (isum != scale) {
                int diff = isum - scale;
                if (diff < 0) {
                    short val = grid.ptr<short>(ksize-1)[ksize-1];
                    grid.ptr<short>(ksize-1)[ksize-1] = (short)(val - diff);
                }
            }
            gridrow.push_back(grid);
        }
        grids.push_back(gridrow);
    }
    
    return grids;
}

static uchar _fixed_pt_cast(int val) {
    int bits = 15;
    int shift = bits;
    int delta = 1 << (bits - 1);
    return cv::saturate_cast<uchar>((val + delta) >> shift);
}

static void _remap_bilinar(const cv::Mat& src, cv::Mat& dst, const cv::Mat& mapx, const cv::Mat& mapy, const std::vector<std::vector<cv::Mat>>& grids, const cv::Mat& gridy_indices, const cv::Mat& gridx_indices, int borderType, uchar borderValue) {
    CV_Assert(src.type() == CV_8UC1);
    CV_Assert(mapx.type() == CV_16SC1);
    CV_Assert(mapy.type() == CV_16SC1);
    CV_Assert(gridy_indices.type() == CV_16UC1);
    CV_Assert(gridx_indices.type() == CV_16UC1);
    
    const int KSIZE = 2;
    for (int i = 0; i < dst.rows; i++) {
        uchar* pdst = dst.ptr<uchar>(i);
        const short* px = mapx.ptr<short>(i);
        const short* py = mapy.ptr<short>(i);
        
        for (int j = 0; j < dst.cols; j++) {
            int sx = (int)px[j];
            int sy = (int)py[j];
            ushort gridy_index = gridy_indices.ptr<ushort>(i)[j];
            ushort gridx_index = gridx_indices.ptr<ushort>(i)[j];
            cv::Mat w = grids[gridy_index][gridx_index];
            
            if ((unsigned)sx < src.cols - (KSIZE-1) && (unsigned)sy < src.rows - (KSIZE-1)) {
                int isum = 0;
                for (int ky = 0; ky < KSIZE; ky++) {
                    for (int kx = 0; kx < KSIZE; kx++) {
                        isum += int(w.ptr<short>(ky)[kx] * src.ptr<uchar>(sy+ky)[sx+kx]);
                    }
                }
                pdst[j] = _fixed_pt_cast(isum);
            } else {
                if (borderType == cv::BORDER_CONSTANT && (sx >= src.cols || sx + KSIZE <= 0 || sy >= src.rows || sy + KSIZE <= 0)) {
                    pdst[j] = borderValue;
                } else {
                    int isum = 0;
                    if (borderType == cv::BORDER_REPLICATE) {
                        int sxs[KSIZE], sys[KSIZE];
                        for (int ks = 0; ks < KSIZE; ks++) {
                            sxs[ks] = _clip(sx + ks, 0, src.cols);
                            sys[ks] = _clip(sy + ks, 0, src.rows);
                        }
                        for (int ky = 0; ky < KSIZE; ky++) {
                            int yky = sys[ky];
                            for (int kx = 0; kx < KSIZE; kx++) {
                                int xkx = sxs[kx];
                                isum += int(src.ptr<uchar>(yky)[xkx] * w.ptr<short>(ky)[kx]);
                            }
                        }
                    } else {
                        int sxs[KSIZE], sys[KSIZE];
                        for (int ks = 0; ks < KSIZE; ks++) {
                            sxs[ks] = _border_interpolate(sx+ks, src.cols, borderType);
                            sys[ks] = _border_interpolate(sy+ks, src.rows, borderType);
                        }
                        for (int ky = 0; ky < KSIZE; ky++) {
                            int yky = sys[ky];
                            
                            for (int kx = 0; kx < KSIZE; kx++) {
                                int xkx = sxs[kx];
                                if (yky < 0) {
                                    isum += borderValue * w.ptr<short>(ky)[kx];
                                    continue;
                                }
                                if (xkx < 0) {
                                    isum += borderValue * w.ptr<short>(ky)[kx];
                                    continue;
                                }
                                isum += int(src.ptr<uchar>(yky)[xkx] * w.ptr<short>(ky)[kx]);
                            }
                        }
                    }
                    pdst[j] = _fixed_pt_cast(isum);
                }
            }
        }
    }
}

static void _cubic_interpolate_coeffs(float x, float& coeff1, float& coeff2, float& coeff3, float& coeff4) {
    const float A = -0.75f;
    coeff1 = A*x*x*x-2*A*x*x+A*x;
    coeff2 = (A+2)*x*x*x-(A+3)*x*x+1;
    coeff3 = -(A+2)*x*x*x+(2*A-3)*x*x-A*x;
    coeff4 = -A*x*x*x+A*x*x;
}

static void _init_1d_cubic_interpolate_grid(int gridsize, std::vector<float>& coeffs1, std::vector<float>& coeffs2, std::vector<float>& coeffs3, std::vector<float>& coeffs4) {
    float scale = 1.f / gridsize;
    coeffs1.resize(gridsize);
    coeffs2.resize(gridsize);
    coeffs3.resize(gridsize);
    coeffs4.resize(gridsize);
    for (int i = 0; i < gridsize; i++) {
        float x = i * scale;
        float coeff1, coeff2, coeff3, coeff4;
        _cubic_interpolate_coeffs(x, coeff1, coeff2, coeff3, coeff4);
        coeffs1[i] = coeff1;
        coeffs2[i] = coeff2;
        coeffs3[i] = coeff3;
        coeffs4[i] = coeff4;
    }
}

static std::vector<std::vector<cv::Mat>> _init_2d_cubic_interpolate_grid() {
    int gridsize = cv::INTER_TAB_SIZE;  // 32
    std::vector<float> coeffs1, coeffs2, coeffs3, coeffs4;
    _init_1d_cubic_interpolate_grid(gridsize, coeffs1, coeffs2, coeffs3, coeffs4);
    std::vector<std::vector<float>> coeffs;
    coeffs.push_back(coeffs1);
    coeffs.push_back(coeffs2);
    coeffs.push_back(coeffs3);
    coeffs.push_back(coeffs4);
    
    int bits = 15;
    int scale = (1 << bits);
    
    int ksize = 4;
    std::vector<std::vector<cv::Mat>> grids;
    for (int i = 0; i < gridsize; i++) {
        std::vector<cv::Mat> gridrow;
        for (int j = 0; j < gridsize; j++) {
            int isum = 0;
            cv::Mat grid(ksize, ksize, CV_16SC1);
            
            for (int k1 = 0; k1 < ksize; k1++) {
                float vy = coeffs[k1][i];
                for (int k2 = 0; k2 < ksize; k2++) {
                    float vx = coeffs[k2][j];
                    float v = vy * vx;
                    short val = cv::saturate_cast<short>(v * scale);
                    grid.ptr<short>(k1)[k2] = val;
                    isum += val;
                }
            }
            
            if (isum != scale) {
                int diff = isum - scale;
                
                int ksize2 = ksize / 2;
                int maxK1 = ksize2, maxK2 = ksize2, minK1 = ksize2, minK2 = ksize2;
                for (int k1 = ksize2; k1 < ksize2+2; k1++) {
                    for (int k2 = ksize2; k2 < ksize2+2; k2++) {
                        if (grid.ptr<short>(k1)[k2] < grid.ptr<short>(minK1)[minK2]) {
                            minK1 = k1;
                            minK2 = k2;
                        } else if (grid.ptr<short>(k1)[k2] > grid.ptr<short>(maxK1)[maxK2]) {
                            maxK1 = k1;
                            maxK2 = k2;
                        }
                    }
                }
                
                if (diff < 0) {
                    short val = grid.ptr<short>(maxK1)[maxK2];
                    grid.ptr<short>(maxK1)[maxK2] = (short)(val - diff);
                } else {
                    short val = grid.ptr<short>(minK1)[minK2];
                    grid.ptr<short>(minK1)[minK2] = (short)(val - diff);
                }
            }
            gridrow.push_back(grid);
        }
        grids.push_back(gridrow);
    }
    
    return grids;
}

static void _remap_cubic(const cv::Mat& src, cv::Mat& dst, const cv::Mat& mapx, const cv::Mat& mapy, const std::vector<std::vector<cv::Mat>>& grids, const cv::Mat& gridy_indices, const cv::Mat& gridx_indices, int borderType, uchar borderValue) {
    CV_Assert(src.type() == CV_8UC1);
    CV_Assert(mapx.type() == CV_16SC1);
    CV_Assert(mapy.type() == CV_16SC1);
    CV_Assert(gridy_indices.type() == CV_16UC1);
    CV_Assert(gridx_indices.type() == CV_16UC1);
    
    const int KSIZE = 4;
    for (int i = 0; i < dst.rows; i++) {
        uchar* pdst = dst.ptr<uchar>(i);
        const short* px = mapx.ptr<short>(i);
        const short* py = mapy.ptr<short>(i);
        
        for (int j = 0; j < dst.cols; j++) {
            int sx = (int)px[j]-1;
            int sy = (int)py[j]-1;
            ushort gridy_index = gridy_indices.ptr<ushort>(i)[j];
            ushort gridx_index = gridx_indices.ptr<ushort>(i)[j];
            cv::Mat w = grids[gridy_index][gridx_index];
            
            if ((unsigned)sx < src.cols - (KSIZE-1) && (unsigned)sy < src.rows - (KSIZE-1)) {
                int isum = 0;
                for (int ky = 0; ky < KSIZE; ky++) {
                    for (int kx = 0; kx < KSIZE; kx++) {
                        isum += int(w.ptr<short>(ky)[kx] * src.ptr<uchar>(sy+ky)[sx+kx]);
                    }
                }
                pdst[j] = _fixed_pt_cast(isum);
            } else {
                if (borderType == cv::BORDER_CONSTANT && (sx >= src.cols || sx + KSIZE <= 0 || sy >= src.rows || sy + KSIZE <= 0)) {
                    pdst[j] = borderValue;
                    continue;
                }
                int isum = 0;
                int sxs[KSIZE], sys[KSIZE];
                for (int ks = 0; ks < KSIZE; ks++) {
                    sxs[ks] = _border_interpolate(sx+ks, src.cols, borderType);
                    sys[ks] = _border_interpolate(sy+ks, src.rows, borderType);
                }
                for (int ky = 0; ky < KSIZE; ky++) {
                    int yky = sys[ky];
                    
                    for (int kx = 0; kx < KSIZE; kx++) {
                        int xkx = sxs[kx];
                        if (yky < 0) {
                            isum += borderValue * w.ptr<short>(ky)[kx];
                            continue;
                        }
                        if (xkx < 0) {
                            isum += borderValue * w.ptr<short>(ky)[kx];
                            continue;
                        }
                        isum += int(src.ptr<uchar>(yky)[xkx] * w.ptr<short>(ky)[kx]);
                    }
                }
                pdst[j] = _fixed_pt_cast(isum);
            }
        }
    }
}

static void _lanczos4_interpolate_coeffs(float x, float& coeff1, float& coeff2, float& coeff3, float& coeff4, float& coeff5, float& coeff6, float& coeff7, float& coeff8) {
    if (x < FLT_EPSILON) {
        coeff1 = coeff2 = coeff3 = coeff5 = coeff6 = coeff7 = coeff8 = 0;
        coeff4 = 1;
        return;
    }
    double x1 = -(x+3-0) * CV_PI;
    double x2 = -(x+3-1) * CV_PI;
    double x3 = -(x+3-2) * CV_PI;
    double x4 = -(x+3-3) * CV_PI;
    double x5 = -(x+3-4) * CV_PI;
    double x6 = -(x+3-5) * CV_PI;
    double x7 = -(x+3-6) * CV_PI;
    double x8 = -(x+3-7) * CV_PI;
    coeff1 = 4 * std::sin(x1) * std::sin(x1/4) / (x1*x1);
    coeff2 = 4 * std::sin(x2) * std::sin(x2/4) / (x2*x2);
    coeff3 = 4 * std::sin(x3) * std::sin(x3/4) / (x3*x3);
    coeff4 = 4 * std::sin(x4) * std::sin(x4/4) / (x4*x4);
    coeff5 = 4 * std::sin(x5) * std::sin(x5/4) / (x5*x5);
    coeff6 = 4 * std::sin(x6) * std::sin(x6/4) / (x6*x6);
    coeff7 = 4 * std::sin(x7) * std::sin(x7/4) / (x7*x7);
    coeff8 = 4 * std::sin(x8) * std::sin(x8/4) / (x8*x8);
    float sum = coeff1 + coeff2 + coeff3 + coeff4 + coeff5 + coeff6 + coeff7 + coeff8;
    coeff1 /= sum;
    coeff2 /= sum;
    coeff3 /= sum;
    coeff4 /= sum;
    coeff5 /= sum;
    coeff6 /= sum;
    coeff7 /= sum;
    coeff8 /= sum;
}

static void _init_1d_lanczos4_interpolate_grid(int gridsize, std::vector<float>& coeffs1, std::vector<float>& coeffs2, std::vector<float>& coeffs3, std::vector<float>& coeffs4, std::vector<float>& coeffs5, std::vector<float>& coeffs6, std::vector<float>& coeffs7, std::vector<float>& coeffs8) {
    float scale = 1.f / gridsize;
    coeffs1.resize(gridsize);
    coeffs2.resize(gridsize);
    coeffs3.resize(gridsize);
    coeffs4.resize(gridsize);
    coeffs5.resize(gridsize);
    coeffs6.resize(gridsize);
    coeffs7.resize(gridsize);
    coeffs8.resize(gridsize);
    for (int i = 0; i < gridsize; i++) {
        float x = i * scale;
        float coeff1, coeff2, coeff3, coeff4, coeff5, coeff6, coeff7, coeff8;
        _lanczos4_interpolate_coeffs(x, coeff1, coeff2, coeff3, coeff4, coeff5, coeff6, coeff7, coeff8);
        coeffs1[i] = coeff1;
        coeffs2[i] = coeff2;
        coeffs3[i] = coeff3;
        coeffs4[i] = coeff4;
        coeffs5[i] = coeff5;
        coeffs6[i] = coeff6;
        coeffs7[i] = coeff7;
        coeffs8[i] = coeff8;
    }
}

static std::vector<std::vector<cv::Mat>> _init_2d_lanczos4_interpolate_grid() {
    int gridsize = cv::INTER_TAB_SIZE;  // 32
    std::vector<float> coeffs1, coeffs2, coeffs3, coeffs4, coeffs5, coeffs6, coeffs7, coeffs8;
    _init_1d_lanczos4_interpolate_grid(gridsize, coeffs1, coeffs2, coeffs3, coeffs4, coeffs5, coeffs6, coeffs7, coeffs8);
    std::vector<std::vector<float>> coeffs;
    coeffs.push_back(coeffs1);
    coeffs.push_back(coeffs2);
    coeffs.push_back(coeffs3);
    coeffs.push_back(coeffs4);
    coeffs.push_back(coeffs5);
    coeffs.push_back(coeffs6);
    coeffs.push_back(coeffs7);
    coeffs.push_back(coeffs8);
    
    int bits = 15;
    int scale = (1 << bits);
    
    int ksize = 8;
    std::vector<std::vector<cv::Mat>> grids;
    for (int i = 0; i < gridsize; i++) {
        std::vector<cv::Mat> gridrow;
        for (int j = 0; j < gridsize; j++) {
            int isum = 0;
            cv::Mat grid(ksize, ksize, CV_16SC1);
            
            for (int k1 = 0; k1 < ksize; k1++) {
                float vy = coeffs[k1][i];
                for (int k2 = 0; k2 < ksize; k2++) {
                    float vx = coeffs[k2][j];
                    float v = vy * vx;
                    short val = cv::saturate_cast<short>(v * scale);
                    grid.ptr<short>(k1)[k2] = val;
                    isum += val;
                }
            }
            
            if (isum != scale) {
                int diff = isum - scale;
                
                int ksize2 = ksize / 2;
                int maxK1 = ksize2, maxK2 = ksize2, minK1 = ksize2, minK2 = ksize2;
                for (int k1 = ksize2; k1 < ksize2+2; k1++) {
                    for (int k2 = ksize2; k2 < ksize2+2; k2++) {
                        if (grid.ptr<short>(k1)[k2] < grid.ptr<short>(minK1)[minK2]) {
                            minK1 = k1;
                            minK2 = k2;
                        } else if (grid.ptr<short>(k1)[k2] > grid.ptr<short>(maxK1)[maxK2]) {
                            maxK1 = k1;
                            maxK2 = k2;
                        }
                    }
                }
                
                if (diff < 0) {
                    short val = grid.ptr<short>(maxK1)[maxK2];
                    grid.ptr<short>(maxK1)[maxK2] = (short)(val - diff);
                } else {
                    short val = grid.ptr<short>(minK1)[minK2];
                    grid.ptr<short>(minK1)[minK2] = (short)(val - diff);
                }
            }
            gridrow.push_back(grid);
        }
        grids.push_back(gridrow);
    }
    
    return grids;
}

static void _remap_lanczos4(const cv::Mat& src, cv::Mat& dst, const cv::Mat& mapx, const cv::Mat& mapy, const std::vector<std::vector<cv::Mat>>& grids, const cv::Mat& gridy_indices, const cv::Mat& gridx_indices, int borderType, uchar borderValue) {
    CV_Assert(src.type() == CV_8UC1);
    CV_Assert(mapx.type() == CV_16SC1);
    CV_Assert(mapy.type() == CV_16SC1);
    CV_Assert(gridy_indices.type() == CV_16UC1);
    CV_Assert(gridx_indices.type() == CV_16UC1);
    
    const int KSIZE = 8;
    for (int i = 0; i < dst.rows; i++) {
        uchar* pdst = dst.ptr<uchar>(i);
        const short* px = mapx.ptr<short>(i);
        const short* py = mapy.ptr<short>(i);
        
        for (int j = 0; j < dst.cols; j++) {
            int sx = (int)px[j]-3;
            int sy = (int)py[j]-3;
            ushort gridy_index = gridy_indices.ptr<ushort>(i)[j];
            ushort gridx_index = gridx_indices.ptr<ushort>(i)[j];
            cv::Mat w = grids[gridy_index][gridx_index];
            
            if ((unsigned)sx < src.cols - (KSIZE-1) && (unsigned)sy < src.rows - (KSIZE-1)) {
                int isum = 0;
                for (int ky = 0; ky < KSIZE; ky++) {
                    for (int kx = 0; kx < KSIZE; kx++) {
                        isum += int(w.ptr<short>(ky)[kx] * src.ptr<uchar>(sy+ky)[sx+kx]);
                    }
                }
                pdst[j] = _fixed_pt_cast(isum);
            } else {
                if (borderType == cv::BORDER_CONSTANT && (sx >= src.cols || sx + KSIZE <= 0 || sy >= src.rows || sy + KSIZE <= 0)) {
                    pdst[j] = borderValue;
                    continue;
                }
                int isum = 0;
                int sxs[KSIZE], sys[KSIZE];
                for (int ks = 0; ks < KSIZE; ks++) {
                    sxs[ks] = _border_interpolate(sx+ks, src.cols, borderType);
                    sys[ks] = _border_interpolate(sy+ks, src.rows, borderType);
                }
                for (int ky = 0; ky < KSIZE; ky++) {
                    int yky = sys[ky];
                    
                    for (int kx = 0; kx < KSIZE; kx++) {
                        int xkx = sxs[kx];
                        if (yky < 0) {
                            isum += borderValue * w.ptr<short>(ky)[kx];
                            continue;
                        }
                        if (xkx < 0) {
                            isum += borderValue * w.ptr<short>(ky)[kx];
                            continue;
                        }
                        isum += int(src.ptr<uchar>(yky)[xkx] * w.ptr<short>(ky)[kx]);
                    }
                }
                pdst[j] = _fixed_pt_cast(isum);
            }
        }
    }
}

static cv::Mat _remap(const cv::Mat& src, const cv::Mat& mapx, const cv::Mat& mapy, int interpolation, int borderType, uchar borderValue) {
    CV_Assert(src.type() == CV_8UC1);
    CV_Assert(mapx.type() == CV_32FC1);
    CV_Assert(mapy.type() == CV_32FC1);
    
    cv::Mat dst(src.size(), CV_8UC1);
    
    if (interpolation == cv::INTER_NEAREST) {
        _remap_nearest(src, dst, mapx, mapy, borderType, borderValue);
    } else {
        cv::Mat mapx_short(mapx.size(), CV_16SC1);
        cv::Mat mapy_short(mapy.size(), CV_16SC1);
        cv::Mat gridy_indices(mapx.size(), CV_16UC1);
        cv::Mat gridx_indices(mapy.size(), CV_16UC1);
        int gridsize = cv::INTER_TAB_SIZE;  // 32
        for (int i = 0; i < mapx.rows; i++) {
            for (int j = 0; j < mapx.cols; j++) {
                const float sx = mapx.ptr<float>(i)[j];
                const float sy = mapy.ptr<float>(i)[j];
                int xx = cvRound(sx * cv::INTER_TAB_SIZE);
                int yy = cvRound(sy * cv::INTER_TAB_SIZE);
                mapx_short.ptr<short>(i)[j] = cv::saturate_cast<short>(xx >> cv::INTER_BITS);  // 5
                mapy_short.ptr<short>(i)[j] = cv::saturate_cast<short>(yy >> cv::INTER_BITS);
                gridy_indices.ptr<ushort>(i)[j] = ((ushort)(yy & (gridsize-1)));
                gridx_indices.ptr<ushort>(i)[j] = ((ushort)(xx & (gridsize-1)));
            }
        }
        
        if (interpolation == cv::INTER_LINEAR) {
            _remap_bilinar(src, dst, mapx_short, mapy_short, _init_2d_linear_interpolate_grid(), gridy_indices, gridx_indices, borderType, borderValue);
        } else if (interpolation == cv::INTER_CUBIC) {
            _remap_cubic(src, dst, mapx_short, mapy_short, _init_2d_cubic_interpolate_grid(), gridy_indices, gridx_indices, borderType, borderValue);
        } else if (interpolation == cv::INTER_LANCZOS4) {
            _remap_lanczos4(src, dst, mapx_short, mapy_short, _init_2d_lanczos4_interpolate_grid(), gridy_indices, gridx_indices, borderType, borderValue);
        }
    }
    
    return dst;
}

int main(int argc, char **argv) {
    /*
    cv::Mat mat = (cv::Mat_<uchar>(3, 3) << 1, 2, 3, 4, 5, 6, 7, 8, 9);
    cv::Mat mx = (cv::Mat_<float>(3, 3) << 1, 1, 1, 1, 1, 1, 1, 1, 1);
    cv::Mat my = (cv::Mat_<float>(3, 3) << 0, 1, 2, 0, 1, 2, 0, 1, 2);
    cv::Mat dst;
    cv::remap(mat, dst, mx, my, cv::INTER_LINEAR, cv::BORDER_CONSTANT, cv::Scalar(128));
    cv::Mat dst2 = _my_simple_remap(mat, mx, my);
    std::cout << "mat = \n" << mat << std::endl;
    std::cout << "dst = \n" << dst << std::endl;
    std::cout << "dst2 = \n" << dst2 << std::endl;
    */
    /*
    cv::Mat src = cv::imread("/home/xlll/Downloads/opencv/samples/data/lena.jpg", cv::IMREAD_UNCHANGED);
    if (src.empty()) {
        std::cout << "failed to read image" << std::endl;
        return EXIT_FAILURE;
    }
    
    cv::Mat mapx(src.size(), CV_32FC1), mapy(src.size(), CV_32FC1);
    
    // horizontal flip
    for (int i = 0; i < src.rows; i++) {
        for (int j = 0; j < src.cols; j++) {
            mapx.ptr<float>(i)[j] = src.cols-j-1;
            mapy.ptr<float>(i)[j] = i;
        }
    }
    cv::Mat hflip;
    cv::remap(src, hflip, mapx, mapy, cv::INTER_LINEAR, cv::BORDER_CONSTANT, cv::Scalar(128));
    
    cv::Mat mapxy(src.size(), CV_32FC2);
    for (int i = 0; i < src.rows; i++) {
        for (int j = 0; j < src.cols; j++) {
            mapxy.ptr<cv::Vec2f>(i)[j][0] = src.cols-j-1;
            mapxy.ptr<cv::Vec2f>(i)[j][1] = i;
        }
    }
    cv::Mat hflip1;
    cv::remap(src, hflip1, mapxy, cv::Mat(), cv::INTER_LINEAR, cv::BORDER_CONSTANT, cv::Scalar(128));
    
    std::vector<cv::Mat> hflips, hflip1s;
    cv::split(hflip, hflips);
    cv::split(hflip1, hflip1s);
    cv::Mat diff1 = hflips[0] != hflip1s[0];
    cv::Mat diff2 = hflips[1] != hflip1s[1];
    cv::Mat diff3 = hflips[2] != hflip1s[2];
    std::vector<cv::Point> nonzeros1, nonzeros2, nonzeros3;
    cv::findNonZero(diff1, nonzeros1);
    cv::findNonZero(diff2, nonzeros2);
    cv::findNonZero(diff3, nonzeros3);
    std::cout << "nonzeros1 size = " << nonzeros1.size() << std::endl;
    std::cout << "nonzeros2 size = " << nonzeros2.size() << std::endl;
    std::cout << "nonzeros3 size = " << nonzeros3.size() << std::endl;
    
    // vertical flip
    for (int i = 0; i < src.rows; i++) {
        for (int j = 0; j < src.cols; j++) {
            mapx.ptr<float>(i)[j] = j;
            mapy.ptr<float>(i)[j] = src.rows - i - 1;
        }
    }
    cv::Mat vflip;
    cv::remap(src, vflip, mapx, mapy, cv::INTER_LINEAR, cv::BORDER_CONSTANT, cv::Scalar(128));
    
    // horizontal and vertical flip
    for (int i = 0; i < src.rows; i++) {
        for (int j = 0; j < src.cols; j++) {
            mapx.ptr<float>(i)[j] = src.cols - j - 1;
            mapy.ptr<float>(i)[j] = src.rows - i - 1;
        }
    }
    cv::Mat hvflip;
    cv::remap(src, hvflip, mapx, mapy, cv::INTER_LINEAR, cv::BORDER_CONSTANT, cv::Scalar(128));
    
    // zoom out
    for (int i = 0; i < src.rows; i++) {
        for (int j = 0; j < src.cols; j++) {
            if (i > src.rows*0.25 && i < src.rows*0.75 && j < src.cols*0.75 && j > src.cols*0.25) {
                mapx.ptr<float>(i)[j] = 2*(j-src.cols*0.25);
                mapy.ptr<float>(i)[j] = 2*(i-src.rows*0.25);
            } else {
                mapx.ptr<float>(i)[j] = 0;
                mapy.ptr<float>(i)[j] = 0;
            }
        }
    }
    cv::Mat zoomout;
    cv::remap(src, zoomout, mapx, mapy, cv::INTER_LINEAR, cv::BORDER_CONSTANT, cv::Scalar(128));
    
    // zoom in
    for (int i = 0; i < src.rows; i++) {
        for (int j = 0; j < src.cols; j++) {
            mapx.ptr<float>(i)[j] = j / 2. + src.cols / 4.;
            mapy.ptr<float>(i)[j] = i / 2. + src.rows / 4.;
        }
    }
    cv::Mat zoomin;
    cv::remap(src, zoomin, mapx, mapy, cv::INTER_LINEAR, cv::BORDER_CONSTANT, cv::Scalar(128));
    
    cv::imshow("src", src);
    cv::imshow("hflip", hflip);
    cv::imshow("hflip1", hflip1);
    cv::imshow("vflip", vflip);
    cv::imshow("hvflip", hvflip);
    cv::imshow("zoomout", zoomout);
    cv::imshow("zoomin", zoomin);
    */
    
    cv::Mat src = cv::imread("/home/xlll/Downloads/opencv/samples/data/lena.jpg", cv::IMREAD_GRAYSCALE);
    if (src.empty()) {
        std::cout << "failed to read image" << std::endl;
        return EXIT_FAILURE;
    }
    
    cv::Mat mapx(src.size(), CV_32FC1), mapy(src.size(), CV_32FC1);
    for (int i = 0; i < src.rows; i++) {
        for (int j = 0; j < src.cols; j++) {
            mapx.ptr<float>(i)[j] = j + src.cols / 2;
            mapy.ptr<float>(i)[j] = i + src.rows / 2;
        }
    }
    cv::Mat dst1, dst2, dst3, dst4, dst5;
    cv::Mat dst11, dst22, dst33, dst44, dst55;
    /*
    cv::remap(src, dst1, mapx, mapy, cv::INTER_NEAREST, cv::BORDER_CONSTANT, cv::Scalar(255));
    cv::remap(src, dst2, mapx, mapy, cv::INTER_NEAREST, cv::BORDER_REPLICATE, cv::Scalar(255));
    cv::remap(src, dst3, mapx, mapy, cv::INTER_NEAREST, cv::BORDER_REFLECT, cv::Scalar(255));
    cv::remap(src, dst4, mapx, mapy, cv::INTER_NEAREST, cv::BORDER_REFLECT_101, cv::Scalar(255));
    cv::remap(src, dst5, mapx, mapy, cv::INTER_NEAREST, cv::BORDER_WRAP, cv::Scalar(255));
    
    dst11 = _remap(src, mapx, mapy, cv::INTER_NEAREST, cv::BORDER_CONSTANT, 255);
    dst22 = _remap(src, mapx, mapy, cv::INTER_NEAREST, cv::BORDER_REPLICATE, 255);
    dst33 = _remap(src, mapx, mapy, cv::INTER_NEAREST, cv::BORDER_REFLECT, 255);
    dst44 = _remap(src, mapx, mapy, cv::INTER_NEAREST, cv::BORDER_REFLECT_101, 255);
    dst55 = _remap(src, mapx, mapy, cv::INTER_NEAREST, cv::BORDER_WRAP, 255);
    */
    /*
    cv::remap(src, dst1, mapx, mapy, cv::INTER_LINEAR, cv::BORDER_CONSTANT, cv::Scalar(255));
    cv::remap(src, dst2, mapx, mapy, cv::INTER_LINEAR, cv::BORDER_REPLICATE, cv::Scalar(255));
    cv::remap(src, dst3, mapx, mapy, cv::INTER_LINEAR, cv::BORDER_REFLECT, cv::Scalar(255));
    cv::remap(src, dst4, mapx, mapy, cv::INTER_LINEAR, cv::BORDER_REFLECT_101, cv::Scalar(255));
    cv::remap(src, dst5, mapx, mapy, cv::INTER_LINEAR, cv::BORDER_WRAP, cv::Scalar(255));
    
    dst11 = _remap(src, mapx, mapy, cv::INTER_LINEAR, cv::BORDER_CONSTANT, 255);
    dst22 = _remap(src, mapx, mapy, cv::INTER_LINEAR, cv::BORDER_REPLICATE, 255);
    dst33 = _remap(src, mapx, mapy, cv::INTER_LINEAR, cv::BORDER_REFLECT, 255);
    dst44 = _remap(src, mapx, mapy, cv::INTER_LINEAR, cv::BORDER_REFLECT_101, 255);
    dst55 = _remap(src, mapx, mapy, cv::INTER_LINEAR, cv::BORDER_WRAP, 255);
    */
    /*
    cv::remap(src, dst1, mapx, mapy, cv::INTER_CUBIC, cv::BORDER_CONSTANT, cv::Scalar(255));
    cv::remap(src, dst2, mapx, mapy, cv::INTER_CUBIC, cv::BORDER_REPLICATE, cv::Scalar(255));
    cv::remap(src, dst3, mapx, mapy, cv::INTER_CUBIC, cv::BORDER_REFLECT, cv::Scalar(255));
    cv::remap(src, dst4, mapx, mapy, cv::INTER_CUBIC, cv::BORDER_REFLECT_101, cv::Scalar(255));
    cv::remap(src, dst5, mapx, mapy, cv::INTER_CUBIC, cv::BORDER_WRAP, cv::Scalar(255));
    
    dst11 = _remap(src, mapx, mapy, cv::INTER_CUBIC, cv::BORDER_CONSTANT, 255);
    dst22 = _remap(src, mapx, mapy, cv::INTER_CUBIC, cv::BORDER_REPLICATE, 255);
    dst33 = _remap(src, mapx, mapy, cv::INTER_CUBIC, cv::BORDER_REFLECT, 255);
    dst44 = _remap(src, mapx, mapy, cv::INTER_CUBIC, cv::BORDER_REFLECT_101, 255);
    dst55 = _remap(src, mapx, mapy, cv::INTER_CUBIC, cv::BORDER_WRAP, 255);
    */
    cv::remap(src, dst1, mapx, mapy, cv::INTER_LANCZOS4, cv::BORDER_CONSTANT, cv::Scalar(255));
    cv::remap(src, dst2, mapx, mapy, cv::INTER_LANCZOS4, cv::BORDER_REPLICATE, cv::Scalar(255));
    cv::remap(src, dst3, mapx, mapy, cv::INTER_LANCZOS4, cv::BORDER_REFLECT, cv::Scalar(255));
    cv::remap(src, dst4, mapx, mapy, cv::INTER_LANCZOS4, cv::BORDER_REFLECT_101, cv::Scalar(255));
    cv::remap(src, dst5, mapx, mapy, cv::INTER_LANCZOS4, cv::BORDER_WRAP, cv::Scalar(255));
    
    dst11 = _remap(src, mapx, mapy, cv::INTER_LANCZOS4, cv::BORDER_CONSTANT, 255);
    dst22 = _remap(src, mapx, mapy, cv::INTER_LANCZOS4, cv::BORDER_REPLICATE, 255);
    dst33 = _remap(src, mapx, mapy, cv::INTER_LANCZOS4, cv::BORDER_REFLECT, 255);
    dst44 = _remap(src, mapx, mapy, cv::INTER_LANCZOS4, cv::BORDER_REFLECT_101, 255);
    dst55 = _remap(src, mapx, mapy, cv::INTER_LANCZOS4, cv::BORDER_WRAP, 255);
    
    std::cout << (_is_same(dst1, dst11) ? "constant: same" : "constant: not same") << std::endl;
    std::cout << (_is_same(dst2, dst22) ? "replicate: same" : "replicate: not same") << std::endl;
    std::cout << (_is_same(dst3, dst33) ? "reflect: same" : "reflect: not same") << std::endl;
    std::cout << (_is_same(dst4, dst44) ? "reflect_101: same" : "reflect_101: not same") << std::endl;
    std::cout << (_is_same(dst5, dst55) ? "wrap: same" : "wrap: not same") << std::endl;
    
    cv::imshow("src", src);
    cv::imshow("dst1_constant", dst1);
    cv::imshow("dst2_replicate", dst2);
    cv::imshow("dst3_reflect", dst3);
    cv::imshow("dst4_reflect_101", dst4);
    cv::imshow("dst5_wrap", dst5);
    
    cv::waitKey(0);
    
    return 0;
}
